import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
import pickle
import time
import datetime
import warnings
warnings.filterwarnings('ignore')
# sklearn
from sklearn import model_selection  # train/test split helpers
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
FRAUD code
def down_sample_textbook(df):
    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    # downsample the majority (non-fraud) class to the size of the minority (fraud) class
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled
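A minimal usage sketch for down_sample_textbook (not from the original; the toy values are invented and it relies on the pandas/sklearn imports above):

# Hypothetical toy frame: six legitimate rows and two fraud rows.
toy = pd.DataFrame({'amt': [5, 9, 3, 7, 2, 4, 120, 310],
                    'is_fraud': [0, 0, 0, 0, 0, 0, 1, 1]})
balanced = down_sample_textbook(toy)
print(balanced.is_fraud.value_counts())  # expected: 2 fraud rows and 2 non-fraud rows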
def compute_time_difference(group):
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            # absolute gap (in nanoseconds) between the i-th and j-th transaction of the group
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
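A hedged usage sketch: the function is meant to be applied per card via groupby, assuming trans_date_trans_time holds pandas Timestamps (the .value accessor needs them); the toy data below is invented.

# Pairwise time gaps within each cc_num group, flattened into [row_i, row_j, Δt] triples.
toy = pd.DataFrame({'cc_num': [1, 1, 2, 2, 2],
                    'trans_date_trans_time': pd.to_datetime(
                        ['2019-01-01 00:00', '2019-01-01 00:05',
                         '2019-01-01 01:00', '2019-01-01 01:30', '2019-01-01 02:00'])})
groups = toy.groupby('cc_num').apply(compute_time_difference)
edges = [triple for sublist in groups for triple in sublist]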
def mask(df):
    df_tr, df_test = sklearn.model_selection.train_test_split(df, random_state=42)
    N = len(df)
    # boolean masks marking which row positions fall in the train and test splits
    train_mask = [i in df_tr.index for i in range(N)]
    test_mask = [i in df_test.index for i in range(N)]
    train_mask = np.array(train_mask)
    test_mask = np.array(test_mask)
    return train_mask, test_mask
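A small check of what mask returns (my own sketch on a toy frame, not from the original):

# The two boolean masks partition the row positions produced by the split.
toy = pd.DataFrame({'amt': range(10)})
tr_m, te_m = mask(toy)
print(tr_m.sum(), te_m.sum(), len(toy))  # train count + test count equals the number of rows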
def edge_index_selected(edge_index):
    # decay each time difference with exp(-Δt/θ), where θ is the mean time difference
    theta = edge_index[:, 2].mean()
    edge_index[:, 2] = (np.exp(-edge_index[:, 2] / theta) != 1) * (np.exp(-edge_index[:, 2] / theta))
    edge_index = edge_index.tolist()
    mean_ = np.array(edge_index)[:, 2].mean()
    # keep only edges whose decayed weight exceeds the mean weight
    selected_edges = [(int(row[0]), int(row[1])) for row in edge_index if row[2] > mean_]
    edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
    return edge_index_selected
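A sketch of the expected input, assuming an (N, 3) array of [source, target, time-difference] rows like the triples from compute_time_difference (the numbers below are invented):

# Toy edge array; only edges with above-average decayed weight survive.
toy_edges = np.array([[0., 1., 10.],
                      [1., 2., 1000.],
                      [2., 3., 5.]])
print(edge_index_selected(toy_edges.copy()))  # 2 x k LongTensor of selected (source, target) pairs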
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)
fraudTrain
trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
%run ../function_proposed_gcn.py
%run ../functions-book.py
Data preparation
df50 = throw(fraudTrain, 0.5)
train_mask, test_mask = mask(df50)
Book (graph analysis of credit card transactions)

- Bipartite graph
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])   # edge source (card node)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])   # edge target (merchant node)

    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from', 'to']).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # edge attributes: fraud label and transaction amount for each card-merchant edge
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"] for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"] for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")

    return G
G_bu = build_graph_bipartite(df50, nx.Graph(name="Bipartite Undirect"))
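A quick, hedged sanity check on the graph built above (my own addition, not from the original):

# Nodes are cards and merchants; each edge carries the aggregated fraud label and amount.
print(nx.number_of_nodes(G_bu), nx.number_of_edges(G_bu))
print(pd.Series(nx.get_edge_attributes(G_bu, "label")).value_counts())  # fraud vs non-fraud edges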
- Tripartite graph
def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # each transaction node (mapping[idx]) is linked to its card node and its merchant node
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)

    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")

    return G
- Nodes are assigned to merchants, customers, and transactions (see the check below).
G_tu = build_graph_tripartite(df50, nx.Graph())
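A hedged check of that structure (my own addition; it assumes the index, card, and merchant identifiers never collide in the node mapping):

# one node per transaction, card, and merchant; two edges per transaction
n_tx = len(df50)
print(nx.number_of_nodes(G_tu), n_tx + df50.cc_num.nunique() + df50.merchant.nunique())  # should match
print(nx.number_of_edges(G_tu), 2 * n_tx)  # should match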
Supervised and unsupervised embeddings for fraud detection

Supervised learning
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    """
    Build a bipartite graph from the input dataframe.

    Parameters:
        df_input (DataFrame): Input dataframe containing transaction information.
        graph_type (networkx graph type, optional): Type of graph to create. Defaults to nx.Graph().

    Returns:
        networkx.Graph: Bipartite graph.
    """
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() + df["merchant"].values.tolist()))}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])   # edge source (card node)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])   # edge target (merchant node)

    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from', 'to']).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # edge attributes: fraud label and transaction amount for each card-merchant edge
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"] for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"] for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")

    return G
def train_and_evaluate_node2vec(df, embedding_dimension=128, test_size=0.2, random_state=42):
    """
    Train and evaluate node2vec embeddings with a Random Forest classifier.

    Parameters:
        df (DataFrame): Input dataframe containing transaction information.
        embedding_dimension (int, optional): Dimension of node embeddings. Defaults to 128.
        test_size (float, optional): Proportion of the dataset to include in the test split. Defaults to 0.2.
        random_state (int, optional): Seed used by the random number generator. Defaults to 42.

    Returns:
        dict: Dictionary containing evaluation metrics.
    """
    G = build_graph_bipartite(df)

    # note: the split below uses sklearn defaults; test_size and random_state are not forwarded here
    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G.edges))),
                                                                 list(nx.get_edge_attributes(G, "label").values()))
    edgs = list(G.edges)
    train_graph = G.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, dimensions=embedding_dimension, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
    evaluation_results = {}

    for cl in classes:
        embeddings_train = cl(keyed_vectors=model_train.wv)

        train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
        test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

        rf = RandomForestClassifier(n_estimators=1000, random_state=random_state)
        rf.fit(train_embeddings, train_labels)

        yhat = rf.predict(test_embeddings)
        acc = metrics.accuracy_score(y, yhat)
        pre = metrics.precision_score(y, yhat)
        rec = metrics.recall_score(y, yhat)
        f1 = metrics.f1_score(y, yhat)
        auc = metrics.roc_auc_score(y, yhat)

        evaluation_results[cl.__name__] = {"accuracy": acc, "precision": pre, "recall": rec, "f1-score": f1, "auc": auc}

    return evaluation_results
# Example usage:
# evaluation_results = train_and_evaluate_node2vec(df50)
# print(evaluation_results)
evaluation_results = train_and_evaluate_node2vec(df50)
print(evaluation_results)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00, 2.55it/s]
{'HadamardEmbedder': {'accuracy': 0.5240514905149052, 'precision': 0.7365591397849462, 'recall': 0.09176155391828533, 'f1-score': 0.16319237641453246, 'auc': 0.5290884534498898}, 'AverageEmbedder': {'accuracy': 0.7130758807588076, 'precision': 0.7094682230869002, 'recall': 0.7327528466175486, 'f1-score': 0.7209225700164745, 'auc': 0.7128466083670335}, 'WeightedL1Embedder': {'accuracy': 0.49390243902439024, 'precision': 0.4857142857142857, 'recall': 0.011386470194239785, 'f1-score': 0.02225130890052356, 'auc': 0.4995246264610679}, 'WeightedL2Embedder': {'accuracy': 0.49762872628726285, 'precision': 0.6190476190476191, 'recall': 0.017414601473543203, 'f1-score': 0.033876221498371335, 'auc': 0.5032240930602808}}
evaluation_results
{'HadamardEmbedder': {'accuracy': 0.5240514905149052,
'precision': 0.7365591397849462,
'recall': 0.09176155391828533,
'f1-score': 0.16319237641453246,
'auc': 0.5290884534498898},
'AverageEmbedder': {'accuracy': 0.7130758807588076,
'precision': 0.7094682230869002,
'recall': 0.7327528466175486,
'f1-score': 0.7209225700164745,
'auc': 0.7128466083670335},
'WeightedL1Embedder': {'accuracy': 0.49390243902439024,
'precision': 0.4857142857142857,
'recall': 0.011386470194239785,
'f1-score': 0.02225130890052356,
'auc': 0.4995246264610679},
'WeightedL2Embedder': {'accuracy': 0.49762872628726285,
'precision': 0.6190476190476191,
'recall': 0.017414601473543203,
'f1-score': 0.033876221498371335,
'auc': 0.5032240930602808}}
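The nested dict is easier to scan as a table; a small sketch of my own using pandas:

# One row per edge embedder, one column per metric.
results_table = pd.DataFrame(evaluation_results).T
print(results_table.round(3))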
train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_bu.edges))),
                                                             list(nx.get_edge_attributes(G_bu, "label").values()))
np.array(train_labels).shape
(8854,)
np.array(y).shape
(2952,)
yhat.mean()
0.0
def try_book(fraudTrain, fraudrate, n, prev_results=None):
    if prev_results is None:
        df_results = pd.DataFrame(columns=[
            'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based',
            'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate',
            'test_size', 'test_frate', 'hyper_params'
        ])
    else:
        df_results = prev_results

    dfrate = throw(fraudTrain, fraudrate)
    df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)

    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)]
    dfnn = dfnn.reset_index(drop=True)
    df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)

    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    G_df = build_graph_tripartite(df, nx.Graph())

    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))),
                                                                 list(nx.get_edge_attributes(G_df, "label").values()),
                                                                 test_size=0.20,
                                                                 random_state=42)

    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    # classes = [HadamardEmbedder]#, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
    # evaluation_results = {}
    embeddings_train = HadamardEmbedder(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    yhat = rf.predict(test_embeddings)
    acc = metrics.accuracy_score(y, yhat)
    pre = metrics.precision_score(y, yhat)
    rec = metrics.recall_score(y, yhat)
    f1 = metrics.f1_score(y, yhat)
    auc = metrics.roc_auc_score(y, yhat)

    result = {
        'model': 'bipartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': 'HadamardEmbedder',
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }

    # note: result is never appended to df_results here, so the saved CSV contains only the header row
    ymdhms = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'../results/{ymdhms}-pyod.csv', index=False)
    return df_results
try_book(fraudTrain, 0.5, 10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [01:29<00:00, 8.93s/it]
model | time | acc | pre | rec | f1 | auc | graph_based | method | throw_rate | train_size | train_cols | train_frate | test_size | test_frate | hyper_params |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
dfrate = throw(fraudTrain, 0.5)
df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)
dfn = fraudTrain[::10]
dfnn = dfn[~dfn.index.isin(df_tr.index)]
dfnn = dfnn.reset_index(drop=True)
df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)
df2, mask = concat(df_tr, df_tstn)
df2['index'] = df2.index
df = df2.reset_index()
G_df = build_graph_tripartite(df, nx.Graph())
train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))),
                                                             list(nx.get_edge_attributes(G_df, "label").values()),
                                                             test_size=0.20,
                                                             random_state=42)
edgs = list(G_df.edges)
train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [01:30<00:00, 9.05s/it]
embeddings_train = AverageEmbedder(keyed_vectors=model_train.wv)
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(train_embeddings, train_labels)
RandomForestClassifier(n_estimators=1000, random_state=42)
yhat = rf.predict(test_embeddings)
acc = metrics.accuracy_score(y, yhat)
pre = metrics.precision_score(y, yhat)
rec = metrics.recall_score(y, yhat)
f1 = metrics.f1_score(y, yhat)
auc = metrics.roc_auc_score(y, yhat)
yhat.sum()
0
acc
0.8716776221777651
pre
0.0
rec
0.0
f1
0.0
auc
0.5
result = {
    'model': 'bipartite',
    'time': None,
    'acc': acc,
    'pre': pre,
    'rec': rec,
    'f1': f1,
    'auc': auc,
    'graph_based': True,
    'method': 'AverageEmbedder',
    'throw_rate': df.is_fraud.mean(),
    'train_size': len(train_labels),
    'train_cols': 'amt',
    'train_frate': np.array(train_labels).mean(),
    'test_size': len(y),
    'test_frate': np.array(y).mean(),
    'hyper_params': None,
    'theta': None,
    'gamma': None
}
#classes = [HadamardEmbedder]#, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
#evaluation_results = {}
try_book(fraudTrain, 0.7, 10)
def try_book_A(fraudTrain, fraudrate, n, prev_results=None):
    if prev_results is None:
        df_results = pd.DataFrame(columns=[
            'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based',
            'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate',
            'test_size', 'test_frate', 'hyper_params'
        ])
    else:
        df_results = prev_results

    dfrate = throw(fraudTrain, fraudrate)
    df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)

    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)]
    dfnn = dfnn.reset_index(drop=True)
    df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)

    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    G_df = build_graph_tripartite(df, nx.Graph())

    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))),
                                                                 list(nx.get_edge_attributes(G_df, "label").values()),
                                                                 test_size=0.20,
                                                                 random_state=42)

    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    embeddings_train = AverageEmbedder(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    yhat = rf.predict(test_embeddings)
    acc = metrics.accuracy_score(y, yhat)
    pre = metrics.precision_score(y, yhat)
    rec = metrics.recall_score(y, yhat)
    f1 = metrics.f1_score(y, yhat)
    auc = metrics.roc_auc_score(y, yhat)

    result = {
        'model': 'bipartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': 'AverageEmbedder',
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }

    df_results = df_results.append(result, ignore_index=True)  # append this run's metrics as a new row

    return df_results
def try_book_W1(fraudTrain, fraudrate, n, prev_results=None):
    if prev_results is None:
        df_results = pd.DataFrame(columns=[
            'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based',
            'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate',
            'test_size', 'test_frate', 'hyper_params'
        ])
    else:
        df_results = prev_results

    dfrate = throw(fraudTrain, fraudrate)
    df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)

    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)]
    dfnn = dfnn.reset_index(drop=True)
    df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)

    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    G_df = build_graph_tripartite(df, nx.Graph())

    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))),
                                                                 list(nx.get_edge_attributes(G_df, "label").values()),
                                                                 test_size=0.20,
                                                                 random_state=42)

    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    embeddings_train = WeightedL1Embedder(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    yhat = rf.predict(test_embeddings)
    acc = metrics.accuracy_score(y, yhat)
    pre = metrics.precision_score(y, yhat)
    rec = metrics.recall_score(y, yhat)
    f1 = metrics.f1_score(y, yhat)
    auc = metrics.roc_auc_score(y, yhat)

    result = {
        'model': 'bipartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': 'WeightedL1Embedder',
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }

    df_results = df_results.append(result, ignore_index=True)  # append this run's metrics as a new row

    return df_results
def try_book_W2(fraudTrain, fraudrate, n, prev_results=None):
    if prev_results is None:
        df_results = pd.DataFrame(columns=[
            'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based',
            'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate',
            'test_size', 'test_frate', 'hyper_params'
        ])
    else:
        df_results = prev_results

    dfrate = throw(fraudTrain, fraudrate)
    df_tr, df_tst = sklearn.model_selection.train_test_split(dfrate)

    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)]
    dfnn = dfnn.reset_index(drop=True)
    df_trn, df_tstn = sklearn.model_selection.train_test_split(dfnn)

    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    G_df = build_graph_tripartite(df, nx.Graph())

    train_edges, test_edges, train_labels, y = train_test_split(list(range(len(G_df.edges))),
                                                                 list(nx.get_edge_attributes(G_df, "label").values()),
                                                                 test_size=0.20,
                                                                 random_state=42)

    edgs = list(G_df.edges)
    train_graph = G_df.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G_df.nodes) - set(train_graph.nodes)))

    node2vec_train = Node2Vec(train_graph, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    yhat = rf.predict(test_embeddings)
    acc = metrics.accuracy_score(y, yhat)
    pre = metrics.precision_score(y, yhat)
    rec = metrics.recall_score(y, yhat)
    f1 = metrics.f1_score(y, yhat)
    auc = metrics.roc_auc_score(y, yhat)

    result = {
        'model': 'bipartite',
        'time': None,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'graph_based': True,
        'method': 'WeightedL2Embedder',
        'throw_rate': df.is_fraud.mean(),
        'train_size': len(train_labels),
        'train_cols': 'amt',
        'train_frate': np.array(train_labels).mean(),
        'test_size': len(y),
        'test_frate': np.array(y).mean(),
        'hyper_params': None,
        'theta': None,
        'gamma': None
    }

    df_results = df_results.append(result, ignore_index=True)  # append this run's metrics as a new row

    return df_results